# dataframe package
import pandas as pd
import numpy as np
# plotting packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# packages for creating the map
import holoviews as hv
from datashader.utils import lnglat_to_meters
import colorcet as cc
hv.extension('bokeh')
# ML packages
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score
# xgboost
from xgboost import XGBRegressor
# Load the California housing dataset and preview 10 random rows.
housing = pd.read_csv("housing.csv")
housing.sample(10)
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9593 | -120.46 | 37.33 | 4.0 | 786.0 | 116.0 | 368.0 | 109.0 | 6.3215 | 138200.0 | INLAND |
| 8178 | -118.11 | 33.80 | 36.0 | 1680.0 | 291.0 | 744.0 | 280.0 | 4.6600 | 244800.0 | <1H OCEAN |
| 6578 | -118.18 | 34.20 | 44.0 | 1473.0 | 250.0 | 668.0 | 239.0 | 8.7200 | 415900.0 | <1H OCEAN |
| 18760 | -122.25 | 40.60 | 16.0 | 2753.0 | 494.0 | 1414.0 | 459.0 | 3.8323 | 128300.0 | INLAND |
| 16446 | -121.29 | 38.15 | 23.0 | 4183.0 | 633.0 | 1886.0 | 628.0 | 4.8787 | 175300.0 | INLAND |
| 12840 | -121.43 | 38.66 | 35.0 | 1814.0 | 367.0 | 1076.0 | 372.0 | 2.7179 | 81100.0 | INLAND |
| 839 | -122.06 | 37.60 | 17.0 | 5159.0 | 832.0 | 3174.0 | 817.0 | 5.8704 | 234400.0 | NEAR BAY |
| 5085 | -118.28 | 33.98 | 39.0 | 1306.0 | 345.0 | 1332.0 | 331.0 | 1.9564 | 92200.0 | <1H OCEAN |
| 6927 | -118.07 | 34.00 | 42.0 | 1392.0 | 351.0 | 1471.0 | 348.0 | 2.6300 | 143800.0 | <1H OCEAN |
| 13363 | -117.64 | 33.99 | 29.0 | 1005.0 | 152.0 | 513.0 | 149.0 | 2.4375 | 181300.0 | INLAND |
# Histogram of every numeric feature (50 bins each) to inspect the distributions.
housing.hist(bins=50, figsize=(20,15))
plt.show()
The data shows a high level of skewness; most of the features do not follow a normal distribution.
# Column dtypes and non-null counts (total_bedrooms has 20433 non-null of 20640 rows).
housing.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20640 entries, 0 to 20639 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 longitude 20640 non-null float64 1 latitude 20640 non-null float64 2 housing_median_age 20640 non-null float64 3 total_rooms 20640 non-null float64 4 total_bedrooms 20433 non-null float64 5 population 20640 non-null float64 6 households 20640 non-null float64 7 median_income 20640 non-null float64 8 median_house_value 20640 non-null float64 9 ocean_proximity 20640 non-null object dtypes: float64(9), object(1) memory usage: 1.6+ MB
Here we can see that we have null values in the total_bedrooms column; we can deal with these values later.
# Summary statistics for each numeric column, transposed for readability.
housing.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| longitude | 20640.0 | -119.569704 | 2.003532 | -124.3500 | -121.8000 | -118.4900 | -118.01000 | -114.3100 |
| latitude | 20640.0 | 35.631861 | 2.135952 | 32.5400 | 33.9300 | 34.2600 | 37.71000 | 41.9500 |
| housing_median_age | 20640.0 | 28.639486 | 12.585558 | 1.0000 | 18.0000 | 29.0000 | 37.00000 | 52.0000 |
| total_rooms | 20640.0 | 2635.763081 | 2181.615252 | 2.0000 | 1447.7500 | 2127.0000 | 3148.00000 | 39320.0000 |
| total_bedrooms | 20433.0 | 537.870553 | 421.385070 | 1.0000 | 296.0000 | 435.0000 | 647.00000 | 6445.0000 |
| population | 20640.0 | 1425.476744 | 1132.462122 | 3.0000 | 787.0000 | 1166.0000 | 1725.00000 | 35682.0000 |
| households | 20640.0 | 499.539680 | 382.329753 | 1.0000 | 280.0000 | 409.0000 | 605.00000 | 6082.0000 |
| median_income | 20640.0 | 3.870671 | 1.899822 | 0.4999 | 2.5634 | 3.5348 | 4.74325 | 15.0001 |
| median_house_value | 20640.0 | 206855.816909 | 115395.615874 | 14999.0000 | 119600.0000 | 179700.0000 | 264725.00000 | 500001.0000 |
# One box plot per numeric feature; independent y-axes since the column scales differ widely.
housing.plot(kind='box',
subplots=True,
sharey=False,
figsize=(25, 15))
# Widen the horizontal gap so the subplots' labels don't overlap.
plt.subplots_adjust(wspace=1)
plt.show()
Looking at the box plot for each of our numerical features, we can see that most of them have huge differences in their values (outliers) — this is, of course, due to the skewness of the data.
# Pairwise Pearson correlations between the numeric columns.
housing.corr(numeric_only=True)
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | |
|---|---|---|---|---|---|---|---|---|---|
| longitude | 1.000000 | -0.924664 | -0.108197 | 0.044568 | 0.069608 | 0.099773 | 0.055310 | -0.015176 | -0.045967 |
| latitude | -0.924664 | 1.000000 | 0.011173 | -0.036100 | -0.066983 | -0.108785 | -0.071035 | -0.079809 | -0.144160 |
| housing_median_age | -0.108197 | 0.011173 | 1.000000 | -0.361262 | -0.320451 | -0.296244 | -0.302916 | -0.119034 | 0.105623 |
| total_rooms | 0.044568 | -0.036100 | -0.361262 | 1.000000 | 0.930380 | 0.857126 | 0.918484 | 0.198050 | 0.134153 |
| total_bedrooms | 0.069608 | -0.066983 | -0.320451 | 0.930380 | 1.000000 | 0.877747 | 0.979728 | -0.007723 | 0.049686 |
| population | 0.099773 | -0.108785 | -0.296244 | 0.857126 | 0.877747 | 1.000000 | 0.907222 | 0.004834 | -0.024650 |
| households | 0.055310 | -0.071035 | -0.302916 | 0.918484 | 0.979728 | 0.907222 | 1.000000 | 0.013033 | 0.065843 |
| median_income | -0.015176 | -0.079809 | -0.119034 | 0.198050 | -0.007723 | 0.004834 | 0.013033 | 1.000000 | 0.688075 |
| median_house_value | -0.045967 | -0.144160 | 0.105623 | 0.134153 | 0.049686 | -0.024650 | 0.065843 | 0.688075 | 1.000000 |
# Rank every feature by its correlation with the target (median_house_value).
corr_matrix = housing.corr(numeric_only=True)
corr_matrix["median_house_value"].sort_values(ascending=False)
median_house_value 1.000000 median_income 0.688075 total_rooms 0.134153 housing_median_age 0.105623 households 0.065843 total_bedrooms 0.049686 population -0.024650 longitude -0.045967 latitude -0.144160 Name: median_house_value, dtype: float64
# Scatter matrix for the features most correlated with the target.
scatter_cols = [
    "median_income",
    "total_rooms",
    "housing_median_age",
    "latitude",
    "median_house_value",
]
pd.plotting.scatter_matrix(housing[scatter_cols], figsize=(25, 20))
plt.show()
# Annotated heatmap of the numeric correlation matrix.
plt.figure(figsize=(15, 10))
heatmap_ax = sns.heatmap(housing.corr(numeric_only=True), cmap="coolwarm", annot=True)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()
After looking at the correlation matrix and its heatmap, we can see that the median house value is mainly affected by the median income of the household. This feature has the highest impact on the house value.
Another thing to note here, is the high number of correlated values, whether they are proportionate or inversely proportionate.
# Copy the frame and project lon/lat degrees to Web Mercator meters
# so the points line up with the map tile layer used below.
housing_plot = housing.copy()
housing_plot["longitude"], housing_plot["latitude"] = lnglat_to_meters(housing["longitude"], housing["latitude"])
housing_plot.head()
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.360658e+07 | 4.562488e+06 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY |
| 1 | -1.360547e+07 | 4.559667e+06 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY |
| 2 | -1.360769e+07 | 4.558257e+06 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY |
| 3 | -1.360881e+07 | 4.558257e+06 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY |
| 4 | -1.360881e+07 | 4.558257e+06 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY |
# Semi-transparent ESRI tile basemap; redim relabels the x/y axes
# (fixes the "Longtitude" typo in the displayed axis label).
esri = hv.element.tiles.ESRI().redim(x="Longitude", y="Latitude").opts(alpha=0.3, bgcolor='black', width=1000, height=700)
# One point per district, colored by its median house value, with hover tooltips.
points = hv.Points(housing_plot, kdims=["longitude", "latitude"]).opts(color="median_house_value", cmap="jet", colorbar=True, tools=["hover"])
# Overlay the points on the basemap.
esri * points
The above insights give us an indication of how the median income and the ocean proximity affect the price of a house; now it's time to start data processing.
First, I'll convert the ocean proximity into a category type
# ocean_proximity holds a small fixed set of labels -> store as a pandas category.
housing["ocean_proximity"] = housing["ocean_proximity"].astype("category")
Secondly, let's fill in the NULL values using the average value in the total bedroom columns
# Impute missing total_bedrooms with the column mean.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: that chained-inplace pattern is deprecated and fails under
# pandas copy-on-write (pandas 3.0), where it cannot update the parent frame.
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(housing["total_bedrooms"].mean())
We can introduce more features into the dataset that might help us better predict the value of the house
# Derive per-household / per-room ratio features from the raw counts.
housing["rooms_per_household"] = housing["total_rooms"].div(housing["households"])
housing["bedrooms_per_room"] = housing["total_bedrooms"].div(housing["total_rooms"])
housing["population_per_household"] = housing["population"].div(housing["households"])
housing.head()
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | ocean_proximity | rooms_per_household | bedrooms_per_room | population_per_household | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | 452600.0 | NEAR BAY | 6.984127 | 0.146591 | 2.555556 |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | 358500.0 | NEAR BAY | 6.238137 | 0.155797 | 2.109842 |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | 352100.0 | NEAR BAY | 8.288136 | 0.129516 | 2.802260 |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | 341300.0 | NEAR BAY | 5.817352 | 0.184458 | 2.547945 |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | 342200.0 | NEAR BAY | 6.281853 | 0.172096 | 2.181467 |
# Re-check the correlations now that the ratio features exist.
housing.corr(numeric_only=True)
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | rooms_per_household | bedrooms_per_room | population_per_household | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| longitude | 1.000000 | -0.924664 | -0.108197 | 0.044568 | 0.069260 | 0.099773 | 0.055310 | -0.015176 | -0.045967 | -0.027540 | 0.076231 | 0.002476 |
| latitude | -0.924664 | 1.000000 | 0.011173 | -0.036100 | -0.066658 | -0.108785 | -0.071035 | -0.079809 | -0.144160 | 0.106389 | -0.092352 | 0.002366 |
| housing_median_age | -0.108197 | 0.011173 | 1.000000 | -0.361262 | -0.318998 | -0.296244 | -0.302916 | -0.119034 | 0.105623 | -0.153277 | 0.130709 | 0.013191 |
| total_rooms | 0.044568 | -0.036100 | -0.361262 | 1.000000 | 0.927253 | 0.857126 | 0.918484 | 0.198050 | 0.134153 | 0.133798 | -0.180316 | -0.024581 |
| total_bedrooms | 0.069260 | -0.066658 | -0.318998 | 0.927253 | 1.000000 | 0.873910 | 0.974725 | -0.007682 | 0.049454 | 0.001535 | 0.069660 | -0.028354 |
| population | 0.099773 | -0.108785 | -0.296244 | 0.857126 | 0.873910 | 1.000000 | 0.907222 | 0.004834 | -0.024650 | -0.072213 | 0.005063 | 0.069863 |
| households | 0.055310 | -0.071035 | -0.302916 | 0.918484 | 0.974725 | 0.907222 | 1.000000 | 0.013033 | 0.065843 | -0.080598 | 0.027430 | -0.027309 |
| median_income | -0.015176 | -0.079809 | -0.119034 | 0.198050 | -0.007682 | 0.004834 | 0.013033 | 1.000000 | 0.688075 | 0.326895 | -0.511031 | 0.018766 |
| median_house_value | -0.045967 | -0.144160 | 0.105623 | 0.134153 | 0.049454 | -0.024650 | 0.065843 | 0.688075 | 1.000000 | 0.151948 | -0.220049 | -0.023737 |
| rooms_per_household | -0.027540 | 0.106389 | -0.153277 | 0.133798 | 0.001535 | -0.072213 | -0.080598 | 0.326895 | 0.151948 | 1.000000 | -0.347501 | -0.004852 |
| bedrooms_per_room | 0.076231 | -0.092352 | 0.130709 | -0.180316 | 0.069660 | 0.005063 | 0.027430 | -0.511031 | -0.220049 | -0.347501 | 1.000000 | 0.002373 |
| population_per_household | 0.002476 | 0.002366 | 0.013191 | -0.024581 | -0.028354 | 0.069863 | -0.027309 | 0.018766 | -0.023737 | -0.004852 | 0.002373 | 1.000000 |
It's time to convert the categorical data into numerical
# One-hot encode ocean_proximity (adds one indicator column per category).
housing = pd.get_dummies(housing, columns=["ocean_proximity"])
Now it's the time to scale the features before we feed them to our ML model
# Separate the predictors from the target.
X = housing.drop("median_house_value", axis=1)
y = housing["median_house_value"]
# Scale every feature into [0, 1].
# NOTE(review): the scaler is fit on the FULL dataset before the train/test
# split below, which leaks test-set statistics into training; consider fitting
# the scaler on X_train only and transforming X_test with it.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
# fit_transform returns a NumPy array; wrap it back into a DataFrame keeping the column names.
X = pd.DataFrame(X_scaled, columns=X.columns)
X.sample(10)
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | rooms_per_household | bedrooms_per_room | population_per_household | ocean_proximity_<1H OCEAN | ocean_proximity_INLAND | ocean_proximity_ISLAND | ocean_proximity_NEAR BAY | ocean_proximity_NEAR OCEAN | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 18334 | 0.218127 | 0.521785 | 0.705882 | 0.074368 | 0.135475 | 0.038118 | 0.133860 | 0.282672 | 0.019453 | 0.073335 | 0.000789 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2468 | 0.444223 | 0.435707 | 0.647059 | 0.036116 | 0.036623 | 0.019984 | 0.036343 | 0.170694 | 0.039410 | 0.035027 | 0.002038 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 2159 | 0.452191 | 0.449522 | 0.921569 | 0.045857 | 0.050900 | 0.020684 | 0.054267 | 0.143481 | 0.032659 | 0.039555 | 0.001244 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 18159 | 0.231076 | 0.512221 | 0.529412 | 0.063279 | 0.053383 | 0.026486 | 0.059201 | 0.413194 | 0.042898 | 0.026871 | 0.001556 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 8538 | 0.596614 | 0.143464 | 0.647059 | 0.019279 | 0.026847 | 0.020180 | 0.032396 | 0.332202 | 0.021212 | 0.053097 | 0.002381 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 10705 | 0.660359 | 0.113709 | 0.490196 | 0.051656 | 0.071695 | 0.017237 | 0.073837 | 0.142660 | 0.026028 | 0.052747 | 0.000548 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1685 | 0.206175 | 0.579171 | 0.529412 | 0.096622 | 0.096369 | 0.046274 | 0.093735 | 0.267245 | 0.041191 | 0.034150 | 0.001774 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 13790 | 0.676295 | 0.199787 | 0.098039 | 0.108398 | 0.116077 | 0.056111 | 0.109357 | 0.204797 | 0.039388 | 0.037636 | 0.001866 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 10602 | 0.651394 | 0.120085 | 0.137255 | 0.062007 | 0.077747 | 0.031111 | 0.079263 | 0.282893 | 0.029814 | 0.046363 | 0.001297 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 5990 | 0.659363 | 0.165781 | 0.705882 | 0.087873 | 0.083313 | 0.037585 | 0.086992 | 0.371664 | 0.040241 | 0.031813 | 0.001484 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
# Preview 10 random target values.
y.sample(10)
18621 261500.0 11388 294900.0 13541 86100.0 7157 156300.0 20630 112000.0 9421 478600.0 9431 240900.0 13116 175700.0 7387 112000.0 6832 245700.0 Name: median_house_value, dtype: float64
# Rename the "<1H OCEAN" dummy column — presumably because XGBoost rejects
# feature names containing "<" (TODO confirm against the XGBoost version in use).
X.rename(columns={"ocean_proximity_<1H OCEAN": "ocean_proximity_1H OCEAN"}, inplace=True)
# Hold out a third of the data for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
Now, it's time to make some predictions by creating a ML model.
# Baseline: ordinary least-squares linear regression.
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
# Evaluate the linear baseline on the held-out test set.
y_pred = model.predict(X_test)
# Take the square root explicitly: mean_squared_error's `squared=False` option
# is deprecated (scikit-learn 1.4) and removed in 1.6 in favor of
# root_mean_squared_error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
evs = explained_variance_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error: {rmse:.2f}, Explained Variance Score: {evs:.3f}, Coefficient of Determination: {r2:.3f}")
Root Mean Squared Error: 71761.59, Explained Variance Score: 0.614, Coefficient of Determination: 0.614
First, I'll create a model using some manually chosen hyperparameters, then I'll try to do some hyperparameter tuning.
# Gradient-boosted tree ensemble with hand-picked hyperparameters:
# 1000 boosting rounds, learning rate 0.08, 75% row subsampling per tree,
# all columns per tree, trees up to depth 7.
xgmodel = XGBRegressor(n_estimators=1000,
learning_rate=0.08,
gamma=0,
subsample=0.75,
colsample_bytree=1,
max_depth=7
)
xgmodel.fit(X_train, y_train)
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
grow_policy='depthwise', importance_type=None,
interaction_constraints='', learning_rate=0.08, max_bin=256,
max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
max_depth=7, max_leaves=0, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=1000, n_jobs=0,
num_parallel_tree=1, predictor='auto', random_state=0, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
grow_policy='depthwise', importance_type=None,
interaction_constraints='', learning_rate=0.08, max_bin=256,
max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
max_depth=7, max_leaves=0, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=1000, n_jobs=0,
num_parallel_tree=1, predictor='auto', random_state=0, ...)y_pred = xgmodel.predict(X_test)
# Same evaluation as the linear baseline, without the deprecated
# `squared=False` flag (removed in scikit-learn 1.6).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
evs = explained_variance_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error: {rmse:.2f}, Explained Variance Score: {evs:.3f}, Coefficient of Determination: {r2:.3f}")
Root Mean Squared Error: 45231.57, Explained Variance Score: 0.847, Coefficient of Determination: 0.846
Here we can see the huge improvement in the accuracy of our model. There might still be room for improvement by doing some hyperparameter tuning.
# Hyperparameter grid: 3^5 = 243 candidates x 5-fold CV = 1215 fits.
params = {'max_depth': [3, 6, 10],
          'learning_rate': [0.01, 0.08, 0.1],
          'n_estimators': [100, 500, 1000],
          'colsample_bytree': [0.3, 0.7, 1],
          'subsample': [0.3, 0.75, 1]
          }
# Exhaustive grid search scored on negative MSE.
# n_jobs=-1 runs the CV fits on all CPU cores — a large speed-up over the
# default sequential execution with identical results.
clf = GridSearchCV(estimator=xgmodel,
                   param_grid=params,
                   scoring='neg_mean_squared_error',
                   n_jobs=-1,
                   verbose=10)
clf.fit(X_train, y_train)
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
GridSearchCV(estimator=XGBRegressor(base_score=0.5, booster='gbtree',
callbacks=None, colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=0, gpu_id=-1,
grow_policy='depthwise',
importance_type=None,
interaction_constraints='',
learning_rate=0.08, max_bin=256,...
max_delta_step=0, max_depth=7, max_leaves=0,
min_child_weight=1, missing=nan,
monotone_constraints='()',
n_estimators=1000, n_jobs=0,
num_parallel_tree=1, predictor='auto',
random_state=0, ...),
param_grid={'colsample_bytree': [0.3, 0.7, 1], 'gamma': [0],
'learning_rate': [0.01, 0.08, 0.1],
'max_depth': [3, 6, 10],
'n_estimators': [100, 500, 1000],
'subsample': [0.3, 0.75, 1]},
scoring='neg_mean_squared_error', verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=XGBRegressor(base_score=0.5, booster='gbtree',
callbacks=None, colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=0, gpu_id=-1,
grow_policy='depthwise',
importance_type=None,
interaction_constraints='',
learning_rate=0.08, max_bin=256,...
max_delta_step=0, max_depth=7, max_leaves=0,
min_child_weight=1, missing=nan,
monotone_constraints='()',
n_estimators=1000, n_jobs=0,
num_parallel_tree=1, predictor='auto',
random_state=0, ...),
param_grid={'colsample_bytree': [0.3, 0.7, 1], 'gamma': [0],
'learning_rate': [0.01, 0.08, 0.1],
'max_depth': [3, 6, 10],
'n_estimators': [100, 500, 1000],
'subsample': [0.3, 0.75, 1]},
scoring='neg_mean_squared_error', verbose=1)XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
grow_policy='depthwise', importance_type=None,
interaction_constraints='', learning_rate=0.08, max_bin=256,
max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
max_depth=7, max_leaves=0, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=1000, n_jobs=0,
num_parallel_tree=1, predictor='auto', random_state=0, ...)XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
grow_policy='depthwise', importance_type=None,
interaction_constraints='', learning_rate=0.08, max_bin=256,
max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
max_depth=7, max_leaves=0, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=1000, n_jobs=0,
num_parallel_tree=1, predictor='auto', random_state=0, ...)print("Best parameters:", clf.best_params_)
# best_score_ is a NEGATIVE MSE (scoring='neg_mean_squared_error'): negate it, then square-root for RMSE.
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))
Best parameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 1000, 'subsample': 0.75}
Lowest RMSE: 45441.33175988161
# Evaluate the best estimator found by the grid search on the held-out test set.
y_pred = clf.predict(X_test)
# Explicit square root instead of the deprecated `squared=False` flag
# (removed in scikit-learn 1.6).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
evs = explained_variance_score(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error: {rmse:.2f}, Explained Variance Score: {evs:.3f}, Coefficient of Determination: {r2:.3f}")
Root Mean Squared Error: 44408.17, Explained Variance Score: 0.852, Coefficient of Determination: 0.852
We can see that the tuning has improved the accuracy of our model a bit. Depending on our business problem, we can decide whether we need to keep improving the accuracy or whether this is already good enough.